# -*- coding: utf-8 -*-
"""
Created on Thu Aug  4 11:14:21 2022

@author: Wieczorek_W_Station
"""

import pandas as pd, os, re, numpy as np
from gensim import parsing

# Base folder of the project; every folder below is located directly inside it.
root = "C:\\Users\\Wieczorek_W_Station\\Dropbox\\Andere Aufgaben\\Malzahn\\"

# Sub-folders of root, each written with a trailing backslash.
path, data, architectural_data, regional_data, output = (
    root + folder + "\\"
    for folder in (
        "tarim-brahmi_database",
        "word_corpus_data",
        "Architectural_terms",
        "regional_data",
        "mca_ready_data",
    )
)

#%%
# Switch into the word-corpus folder so the workbooks can be opened by name.
os.chdir(data)

# =============================================================================
# load classified nouns
# =============================================================================
# "Unnamed: 0" is presumably the index column written by an earlier export
# (TODO confirm); it is removed right after loading.
noun_df = pd.read_excel(
    "nouns_tocharian_a_and_b_classified.xlsx", engine="openpyxl"
)
noun_df = noun_df.drop(columns="Unnamed: 0")

# =============================================================================
# load text fragments
# =============================================================================
fragments_df = pd.read_excel("tocharian_text_fragments.xlsx")
fragments_df = fragments_df.drop(columns="Unnamed: 0")

# Lower-case the translations; cells that are not plain strings stay as they
# are.
fragments_df["prim_trans"] = [
    cell.lower() if type(cell) is str else cell
    for cell in fragments_df.prim_trans
]

# =============================================================================
# load architectural terms
# =============================================================================
os.chdir(architectural_data)
architecture_df = pd.read_excel("freagments_architectural terms.xlsx")

# =============================================================================
# load further regional data
# =============================================================================
os.chdir(regional_data)
regional_df = pd.read_excel("Tocharisch Fundorte_combined.xlsx")
#%%

# =============================================================================
# prepare architectural data
# =============================================================================

## change meanings
# Remove question marks (together with any spaces directly in front of them)
# from the meanings. The pattern is a raw string so that "\?" is not read as
# an (invalid) string escape. Cells that are not strings, e.g. empty cells
# read as NaN, are left untouched instead of raising a TypeError; pandas'
# groupby drops rows with missing keys by default, so they do not become
# columns below.
question_mark = re.compile(r" *\?")
architecture_df.meaning = [
    question_mark.sub("", x) if isinstance(x, str) else x
    for x in architecture_df.meaning
]

# One row per (fragment id, meaning) combination that occurs in the table.
architecture_df_grouped = architecture_df.\
    groupby(by=["IOL Toch 124","meaning"]).size().\
        reset_index()

meanings = list(set(architecture_df_grouped.meaning))

# One indicator column per meaning, initialised to 0 (= not attested).
for m in meanings:
    fragments_df[m] = 0

# Collect the meanings of every fragment id once, so that the grouped table
# does not have to be filtered again for each fragment.
meanings_by_id = {}
for fragment_id, meaning in zip(architecture_df_grouped["IOL Toch 124"],
                                architecture_df_grouped.meaning):
    meanings_by_id.setdefault(fragment_id, []).append(meaning)

for i, index in zip(fragments_df.idno, fragments_df.index):
    # meanings recorded for this fragment id (empty if the id is not listed)
    keys = meanings_by_id.get(i, [])

    print(i,index,keys)
    for k in keys:
        fragments_df.loc[index,k] = 1


#%%
# =============================================================================
# group architectural terms further
# =============================================================================
# Grouping notes for the architectural meanings:
#   building = estate, dwelling, house, roof / dwelling, hut
#   path/way/road = path/way/road, street
#   house_parts = platform, roof, tile/brick, hall, gate, garden, balcony, wall, door
#   village/city/town = city/town, village
#
# Each variable below is a regular expression of alternative keywords; it is
# paired with a column name further down.

monks_monastary = "(monastery|monk|nun|priest|sage|wise one|donor|gift|" +\
    "donation|disciple|monastic)"
gods_buddha = "( god[s]* |brahmā|brahman|omniscents|" + \
    "buddha|bhodda|bhoddi|dharma|arhat)"
demons_spirits = "(ghosts|spirits|nāga|asura|yakṣa|kumbhāṇḍa|" + \
    "kiṃnaras|gandharva|monster|demon)"
hell = "hell"
heaven = "(heaven|tuṣita)" # heaven and hell are thought of as "space"
# Note on "lord [a-z]+": lord always, unless buddha_lord ("lord" is not part
# of the pattern).
royalty = "(king|queen|prince|reign|royal|noble)"
ministers_public_servants = "(eunuch|minister|chancellor|tax collector|" + \
    "assembly|laypeople)" # assembly as social practice
military = "(military|army|general|weapons|sword|warrior|fighters)"


# "settlement" is the correct spelling; the misspelt "settelment" is kept as an
# additional alternative so that everything matched before still matches.
settlement = "(town|city|streets|settlement|settelment)"
palace = "(palace|ornament|castle|throne|harem)"
housing = "(house|moat|dwelling|borough|estate)"
building_parts = "(roof|hut|brick|wall|hall|gate|garden|grove|" + \
    "balcony|door|rafter)"


landscapes = "(island|ocean|lake|sea of|forest|mountain|tree|mount [a-z]+|" + \
    "in the lands of|upstream|downstream|stream|" + \
        "northern|southern|western|eastern)"
travel = "(ship|vessel|karawan|caravan|other country|" + \
    "from country to country |continents|travel to the different countries)"

signum = "(finger measure|finger-measure)"
economy = "(merchant|coins|sell|sold|buy|bought|business|" + \
    "obtaining property|trade|increasing the livestock|rich ones|revenue|" + \
    "expense|picul[s]*|peck[s]*|debt|" + \
        "profit|contract|denarius)"

householder = "(householder|housekeeper)"

jewels_methaphors = "(jewel|gem|crystal)"
# can be interpreted as metaphor for spiritual beings, e.g. Buddha as jewel,
# dharma as jewel
# palaces (heavenly / earthly) can also be related to jewels, gems etc.

worldy_goods = "(possessions|cotton|cloak|mantle|livestock|" +\
    "silk)"

# Column name and pattern are kept side by side so that the two lists derived
# from them always have the same length and order.
term_groups = [
    ("monks_monastary", monks_monastary),
    ("gods_buddha", gods_buddha),
    ("demons_spirits", demons_spirits),
    ("hell", hell),
    ("heaven", heaven),
    ("royalty", royalty),
    ("ministers_public_servants", ministers_public_servants),
    ("military", military),
    ("settlement", settlement),
    ("palace", palace),
    ("housing", housing),
    ("building_parts", building_parts),
    ("landscapes", landscapes),
    ("travel", travel),
    ("economy", economy),
    ("signum", signum),
    ("householder", householder),
    ("jewels_methaphors", jewels_methaphors),
    ("worldly_goods", worldy_goods),
]

list_of_terms = [pattern for _, pattern in term_groups]

headers = [name for name, _ in term_groups]

# =============================================================================
# assign values (1 = found, 0 = not found in fragment)
# =============================================================================
# For every header/pattern pair, search each translation (converted to a
# lower-case string) and store the outcome as a 0/1 column named after the
# header.
for column_name, term_regex in zip(headers, list_of_terms):
    matcher = re.compile(term_regex)
    flags = []
    for translation in fragments_df.prim_trans:
        hit = matcher.search(str(translation).lower())
        flags.append(0 if hit is None else 1)
    fragments_df[column_name] = flags

# Number of fragments in which each group was found.
print(fragments_df[headers].sum())
#%%
# =============================================================================
# create variables stemming from noun_df
# =============================================================================
# Every noun_df column except the ones listed here becomes a variable
# (a column) of fragments_df.
keys = [x for x in noun_df.keys() if x not in ["lemma","meaning","keywords",
                                        "language","pos_class",
                                        "alternative_other_tocharian",
                                        "loan_source"]]

## clean noun_df further
# Keep only nouns whose lemma has more than two characters. A lemma that is
# not a string (e.g. an empty cell read as NaN) counts as length 0 and is
# dropped instead of raising a TypeError.
lemma_lengths = noun_df.lemma.map(
    lambda lemma: len(lemma) if isinstance(lemma, str) else 0)
noun_df = noun_df[lemma_lengths > 2]

# Create the variable columns, initialised to 0. Nothing in the visible part
# of this file fills them afterwards.
for k in keys:
    print(k)
    fragments_df[k] = 0
#%%
# Region code of every fragment: the label of the regional_df column that
# lists the fragment's region, or 0 if no column lists it.
# The column is created with object dtype because it holds the integer 0 next
# to column labels; writing a label into an int64 column relies on silent
# upcasting, which pandas deprecated in 2.1 (PDEP-6).
fragments_df["region_code"] = 0
fragments_df["region_code"] = fragments_df["region_code"].astype(object)

for k in regional_df.keys():
    # region names of this column; empty cells (non-strings) are skipped
    d = [x for x in regional_df[k] if isinstance(x, str)]

    index_no = fragments_df[fragments_df.region.isin(d)].index

    # a region listed in several columns ends up with the last column's label
    fragments_df.loc[index_no, "region_code"] = k
#%%
# =============================================================================
# save data
# =============================================================================
# Write the combined table into the output folder; the file name is relative,
# so the working directory is switched first.
os.chdir(output)

fragments_df.to_excel(excel_writer="fragments_architecture_combined.xlsx")

